March 8, 2021
#@title
import pandas as pd
import altair as alt
from vega_datasets import data
from google.colab import auth
auth.authenticate_user()
# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
#%load_ext google.colab.data_table
#@title
# How to update the data:
# 0. You may need to copy this colab so you have your own version.
# 1. Update the cdc_table to have the latest data's suffix.
# 2. Update the date variables below to be the last case date included in the data.
# If the CRDT doesn't have data on that exact date, choose the closest date for crdt_date.
# 3. If the last case date is after Feb 2, 2021, you'll need to upload a new version
# of the crdt data to compare against and change the crdt_table name below.
# 4. Scatterplot max/min below in chart settings may need to be updated for more cases.
# 5. There are a few checks for the county_fips_mapping that we created due to issues with the CDC's.
# Instructions are at https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006.
# 6. Many/all of the tables and text are not auto-updated. If you want to do a full updated of
# the paper including text and tables, a lot of that is done in commented out PrintSummaryStats().
project_id = 'msm-secure-data-1b'
cdc_table = '`%s.ndunlap_secure.cdc_restricted_access_20210228`' % project_id
crdt_table = '`%s.ndunlap_secure.crdt_20210305`' % project_id
date = 'DATE(2021, 02, 13)'
crdt_date = '20210214'
date_display_name = 'Feb 13'
# Provisional deaths data dates.
provisional_date = 'DATE(2021, 3, 3)'
crdt_provisional_date = 20210303
provisional_date_display_name = 'Mar 3'
provisional_deaths_state_table = '`msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_state_20210303`'
provisional_deaths_county_table = '`msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_county_20210303`'
# Set the scatterplot max/min to better handle outliers (CA, Los Angeles).
total_cases_scale_max = 55000
county_cases_scale_max = 22000
county_cases_zoom_scale_max = 3000
cases_known_scale_max = 50000 # known race/ethnicity
# Chart settings.
scatter_height = 300
scatter_width = 300
map_height = 300
map_width = 450
us_states = alt.topo_feature(data.us_10m.url, 'states')
us_counties = alt.topo_feature(data.us_10m.url+"#", 'counties')
territories = ('PR', 'GU', 'VI', 'MP', 'AS')
nyt_territories = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
states_to_fips = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12, 'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NYC': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56, 'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78}
states_to_abbreviations = {'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'}
race_ethnicity_combined_map = {
'Asian, Non-Hispanic': 'asian_cases',
'Black, Non-Hispanic': 'black_cases',
'White, Non-Hispanic': 'white_cases',
'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
'Hispanic/Latino': 'hispanic_cases',
'Multiple/Other, Non-Hispanic': 'other_cases',
'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
'Missing': 'unknown_cases',
'Unknown': 'unknown_cases',
'NA': 'na_cases',
}
#@title
crdt_query = ('''
SELECT
State as state,
Deaths_Total as crdt_cases,
Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM %s
WHERE
date = %s
''' % (crdt_table, crdt_date))
nyt_states_query = ('''
SELECT
state_name,
state_fips_code,
deaths as nyt_cases,
deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
date = %s AND
state_fips_code IS NOT NULL
''' % date)
nyt_counties_query = ('''
SELECT
county_fips_code,
deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
date = %s AND
county_fips_code IS NOT NULL
''' % date)
cdc_states_query = ('''
SELECT
res_state,
COUNT(*) as cdc_cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
res_state
''' % cdc_table)
cdc_counties_query = ('''
SELECT
res_state,
res_county,
race_ethnicity_combined,
COUNT(*) as cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
res_county,
res_state,
race_ethnicity_combined
''' % cdc_table)
compare_cases_unknowns_query = ('''
SELECT
res_state,
race_ethnicity_combined,
COUNT(*) as cdc_cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
res_state,
race_ethnicity_combined
''' % cdc_table)
cdc_states_by_month_query = ('''
SELECT
res_state,
CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
COUNT(*) as total_cases,
FROM
%s
WHERE
death_yn = 'Yes' AND
cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
cdc_case_earliest_dt < DATE(2021, 1, 1) AND
res_state in ('AK', 'CA', 'CT', 'DE', 'GA', 'LA', 'MD', 'ND', 'NY', 'PA', 'RI')
GROUP BY
1, 2
ORDER BY
1, 2
''' % cdc_table)
cdc_states_by_month_known_or_na_query = ('''
SELECT
res_state,
CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
COUNT(*) as known_or_na_cases,
FROM
%s
WHERE
death_yn = 'Yes' AND
cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
cdc_case_earliest_dt < DATE(2021, 1, 1) AND
race_ethnicity_combined != 'Unknown' AND
race_ethnicity_combined != 'Missing'
GROUP BY
1, 2
ORDER BY
1, 2
''' % cdc_table)
cdc_overall_query = ('''
SELECT
race_ethnicity_combined,
COUNT(*) as cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
1
''' % cdc_table)
cdc_provisional_deaths_state_overall_query = ('''
SELECT
*
FROM
%s
WHERE
Indicator = "Distribution of COVID-19 deaths (%%)" AND
State = 'United States'
''' % provisional_deaths_state_table)
cdc_provisional_deaths_state_counts_overall_query = ('''
SELECT
*
FROM
%s
WHERE
Indicator = "Count of COVID-19 deaths" AND
State = 'United States'
''' % provisional_deaths_state_table)
county_fips_mapping_query = ('''
SELECT
*
FROM
`msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''')
acs_population_data_query = ('''
SELECT
state,
county,
county_fips,
total_pop
FROM
`msm-internal-data.ipums_acs.acs_2019_5year_county`
''')
#@title
# Provisional deaths data queries.
cdc_provisional_deaths_county_query = ('''
SELECT
*
FROM %s
WHERE
Indicator = "Distribution of COVID-19 deaths (%%)"
''' % provisional_deaths_county_table)
cdc_provisional_deaths_state_query = ('''
SELECT
*
FROM %s
WHERE
Indicator = "Distribution of COVID-19 deaths (%%)" AND
State != 'United States'
''' % provisional_deaths_state_table)
cdc_provisional_deaths_state_counts_query = ('''
SELECT
*
FROM %s
WHERE
Indicator = "Count of COVID-19 deaths" AND
State != 'United States'
''' % provisional_deaths_state_table)
crdt_provisional_query = ('''
SELECT
State as state,
Deaths_Total as crdt_cases,
Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM %s
WHERE
date = %s
''' % (crdt_table, crdt_provisional_date))
nyt_counties_provisional_query = ('''
SELECT
county_fips_code,
deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
date = %s AND
county_fips_code IS NOT NULL
''' % provisional_date)
#@title
def FieldAnalysis(project_id, table, field_list):
dict = {}
for field in field_list:
dict[field] = [0.0, 0.0, 0.0, 0.0]
unknowns = pd.DataFrame(dict, index=['Unknown', 'Missing', 'NA', 'Known'])
field_series = []
value_series = []
percent_series = []
for field in field_list:
field_unknowns_query = ('''
SELECT
%s,
count(*) as cases
FROM
%s
WHERE
death_yn = 'Yes'
GROUP BY
%s
''')
query = field_unknowns_query % (field, table, field)
field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
field_unknowns_df.set_index(field, inplace=True)
field_unknowns_df.index = field_unknowns_df.index.fillna('Null')
field_display_name = {
'cdc_case_earliest_dt': 'CDC earliest case date',
'current_status': 'Case status',
'res_state': 'State',
'res_county': 'County',
'sex': 'Sex',
'age_group': 'Age',
'race_ethnicity_combined': 'Race/Ethnicity'}
missing_count = 0
if 'Missing' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['Missing'].cases
if 'Null' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc['Null'].cases
if '' in field_unknowns_df.index:
missing_count += field_unknowns_df.loc[''].cases
#if field_unknowns_df.index.isnull().any():
# missing_count += field_unknowns_df.loc[field_unknowns_df.index.isnull()].cases
unknowns.loc['Missing', field] = missing_count / field_unknowns_df.cases.sum()
if 'Unknown' in field_unknowns_df.index:
unknowns.loc['Unknown', field] = field_unknowns_df.loc['Unknown'].cases / field_unknowns_df.cases.sum()
if 'NA' in field_unknowns_df.index:
unknowns.loc['NA', field] = field_unknowns_df.loc['NA'].cases / field_unknowns_df.cases.sum()
unknowns.loc['Known', field] = 1 - (unknowns.loc['Missing', field] +
unknowns.loc['Unknown', field] +
unknowns.loc['NA', field])
field_series.extend([field_display_name.get(field, field)] * 4)
value_series.extend(['Known', 'Supressed', 'Unknown', 'Missing'])
percent_series.extend([unknowns.loc['Known', field],
unknowns.loc['NA', field],
unknowns.loc['Unknown', field],
unknowns.loc['Missing', field]])
test = pd.DataFrame.from_dict({'field': field_series,
'value': value_series,
'percent': percent_series})
return alt.Chart(test).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='%'), title=''),
y=alt.Y('field', sort='x', title='Field'),
color=alt.Color('value', scale=alt.Scale(scheme='category20'), title='Value'),
order=alt.Order('field:N'),
tooltip=[
alt.Tooltip('field:N', title='Field'),
alt.Tooltip('value:N', title='Value'),
alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
]
)
This analysis picks up where the case data completeness analysis left off to evaluate COVID-19 deaths data with race/ethnicity at the state and county levels.
While we only have two options for COVID-19 case data with race/ethnicity, we have four options for COVID-19 deaths data with race/ethnicity:
In the case data completeness analysis, we compared the CDC Case Surveillance dataset to the Covid Racial Data Tracker (CRDT). In this analysis, we'll compare the CDC Case Surveillance dataset to the CRDT and the NCHS datasets. We will not analyze the American Public Media Research Lab dataset because they use a mixture of data from state public health websites for most states, as the CRDT does, and NCHS data in 14 states where the data are unavailable or less complete than on public health websites.
There are completeness issues with using the CDC case dataset for deaths:
For the 279K deaths where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:
#@title
overall_df = pd.io.gbq.read_gbq(cdc_overall_query, project_id=project_id)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
overall_df = overall_df.set_index('race_ethnicity_combined')
chart_denominator = 1000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
overall_df.cases['black_cases'] / chart_denominator,
overall_df.cases['white_cases'] / chart_denominator,
overall_df.cases['asian_cases'] / chart_denominator,
overall_df.cases['nhpi_cases'] / chart_denominator,
overall_df.cases['aian_cases'] / chart_denominator,
overall_df.cases.sum() / chart_denominator,
]
# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
60481746 / chart_denominator,
40596040 / chart_denominator,
196789401 / chart_denominator,
18427914 / chart_denominator,
565473 / chart_denominator,
2236348 / chart_denominator,
328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
'Hispanic/Latino',
'Black',
'White',
'Asian',
'Native Hawaiian/Pacific Islander',
'American Indian/Alaska Native',
'*Total Including Unknowns*',
], 'percent': percent_list,
'cases': cases_list,
'population': pop_list,
})
bars = alt.Chart(prevalence).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='.2%'), scale=alt.Scale(domain=(0, .0026)), title=''),
y=alt.Y('group', sort='-x', title=''),
color=alt.Color('group',
scale=alt.Scale(scheme='tableau20'),
title='',
legend=None),
tooltip=[
alt.Tooltip('group:N', title='Race/Ethnicity Group'),
alt.Tooltip('percent:Q', format='.3%', title='Percent who died'),
alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
]
).properties(
title='Percent of Race/Ethnicity Group who died from COVID-19 based on incomplete CDC Case Data as of %s' % date_display_name)
bars.display()
#alt.concat(bars).properties(
# title=alt.TitleParams(
# ['Source: U.S. Census Bureau\'s American Community Survey 2019 5-year estimates for population data.'],
# baseline='bottom',
# dy=20,
# orient='bottom',
# fontWeight='normal',
# fontSize=11
# )
#).display()
But the chart above is based on incomplete data. For example, the CDC dataset says that 0% of deaths in California were Hispanic/Latino people, whereas the California public health website reports that Hispanics/Latinos made up 46% of deaths (21K people) as of February 17.
If all 93K deaths with missing race/ethnicity were Hispanic/Latino people, the percent of Hispanic/Latinos in the U.S. who died from COVID-19 would go from 0.05% to 0.21% — a 4x increase. If all 93K deaths with missing race/ethnicity were Black people, the percent of Black people who died from COVID-19 would go from 0.10% to 0.33% — a 3x increase. While these extreme scenarios are unlikely, they show us why missing race/ethnicity data is preventing us from truly understanding and addressing the disparities in the COVID-19 pandemic in the U.S.
We can get a more complete view of the deaths within each race/ethnicity group from the NCHS state dataset, which contains 97% of the deaths in the Covid Tracking Project up to March 3, where 99% of those deaths have race/ethnicity information.
#@title
overall_provisional_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_overall_query, project_id=project_id)
overall_provisional_df
overall_provisional_counts_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_counts_overall_query, project_id=project_id)
overall_provisional_counts_df.head(15)
overall_provisional_df['cdc_known_cases_percent'] = round((
overall_provisional_df.Non_Hispanic_White.fillna(0) +
overall_provisional_df.Non_Hispanic_Black_or_African_American.fillna(0) +
overall_provisional_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
overall_provisional_df.Non_Hispanic_Asian.fillna(0) +
overall_provisional_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
overall_provisional_df.Hispanic_or_Latino.fillna(0)) / 100, 4)
overall_provisional_counts_df['cdc_known_cases'] = round(
overall_provisional_counts_df.Non_Hispanic_White.fillna(0) +
overall_provisional_counts_df.Non_Hispanic_Black_or_African_American.fillna(0) +
overall_provisional_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
overall_provisional_counts_df.Non_Hispanic_Asian.fillna(0) +
overall_provisional_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
overall_provisional_counts_df.Hispanic_or_Latino.fillna(0))
# Extrapolate from the % known race/ethnicity cases and their counts to the total case count for the state.
total_cases_including_unknowns = round(
overall_provisional_counts_df.cdc_known_cases[0] / overall_provisional_df.cdc_known_cases_percent[0], 0)
chart_denominator = 1000
cases_list = [overall_provisional_counts_df.Hispanic_or_Latino[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_Black_or_African_American[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_White[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_Asian[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander[0] / chart_denominator,
overall_provisional_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native[0] / chart_denominator,
total_cases_including_unknowns / chart_denominator,
]
# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
60481746 / chart_denominator,
40596040 / chart_denominator,
196789401 / chart_denominator,
18427914 / chart_denominator,
565473 / chart_denominator,
2236348 / chart_denominator,
328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
'Hispanic/Latino',
'Black',
'White',
'Asian',
'Native Hawaiian/Pacific Islander',
'American Indian/Alaska Native',
'*Total Including Unknowns*',
], 'percent': percent_list,
'cases': cases_list,
'population': pop_list,
})
bars = alt.Chart(prevalence).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='.2%'), title=''),
y=alt.Y('group', sort='-x', title=''),
color=alt.Color('group',
scale=alt.Scale(scheme='tableau20'),
title='',
legend=None),
tooltip=[
alt.Tooltip('group:N', title='Race/Ethnicity Group'),
alt.Tooltip('percent:Q', format='.3%', title='Percent who died'),
alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
]
).properties(
title='Percent of Race/Ethnicity Group who died from COVID-19 based on more complete NCHS Data as of %s' % provisional_date_display_name
)
bars.display()
We can see that all of the percentages are larger due to having more complete case counts, more cases with known race/ethnicity, and two more weeks of data than in the CDC case data. In these results, 0.150% of Hispanics/Latinos have died from COVID-19, while 0.177% of Black people have died from COVID-19. That translates to 150 deaths out of every 100K Hispanic/Latino people and 177 deaths out of every 100K Black people. The group with the highest rate of deaths is American Indian/Alaska Native with 255 deaths per 100K people. These results are more similar to the Covid Racial Data Tracker and American Public Media Research Lab than the CDC case data.
The goal of this analysis is to assess the completeness of the CDC's Case Surveillance Restricted Access dataset and the NCHS datasets to evaluate their feasibility in examining disparities in race/ethnicity for COVID-19 deaths at the state and county levels.
The top-level data completeness findings for the CDC case dataset are:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
FieldAnalysis(project_id, cdc_table, field_list).display()
We will also evaluate a better source for deaths data that comes from death certificates, as reported in the NCHS state dataset, the NCHS county dataset, and on this CDC dashboard. The NCHS state dataset contains 97% of the total deaths in the Covid Tracking Project and 99% of the deaths have race/ethnicity.
We used a composite measurement to evaluate the CDC case dataset and NCHS datasets' completeness as compared to the Covid Racial Data Tracker (CRDT) at the state level. We calculated the percent of total deaths that have race/ethnicity data and broke it down into its two separate components: the percentage of expected death counts included and the percentage of deaths included with race/ethnicity. We looked at the number of states that had at least 50% of total death counts with race/ethnicity and those that had at least 85%.
#@title
# Manually update these fields based on the latest CDC data.
# SELECT
# count(*) as count
# FROM `msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20210131`
# WHERE death_yn = 'Yes'
# AND race_ethnicity_combined NOT IN ('Unknown', 'NA', 'Missing')
# https://covidtracking.com/data/national
# County data calculated in Counties: CDC vs. NYT section.
row_names = [
'% of CTP deaths count',
'% with race/ethnicity',
'Composite % of total with race/ethnicity',
'Number of states with composite > 85%',
'(as a percent of all states)',
'Number of states with composite > 50%',
'(as a percent of all states)',
]
crdt_metadata = [
'100%',
'93%',
'93%',
'45',
'(88%)',
'50',
'(98%)',
]
cdc_metadata = [
'78%',
'75%',
'58%',
'8',
'(17%)',
'32',
'(67%)',
]
cdc_provisional_metadata = [
'97%',
'99%',
'96%',
'44',
'(86%)',
'50',
'(98%)',
]
table_data = {'CRDT': crdt_metadata, 'CDC': cdc_metadata, 'NCHS': cdc_provisional_metadata}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
The NCHS dataset offers an excellent alternative to the case dataset at the state level. At the county level, the NCHS dataset only includes counties with 100 or more deaths, so it contains fewer counties than the case data, but those counties account for a larger percentage of the U.S. population. We will discuss the tradeoffs for county-level datasets in more detail later on in this analysis.
What we didn't include in this report:
To get a baseline of how much we could expect the CDC death counts to match the CRDT or NYT, we can see how closely the CRDT and NYT match each other. Each dot below is a state (hover to see details), and the black line shows where the NYT and CRDT death counts are equal.
#@title
def CreateScatterPlot(
chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
geo_field = 'state'
geo_field_display_name = 'State'
if geo == 'county':
geo_field = 'state_county'
geo_field_display_name = 'County'
if metric_type == 'ratio':
scale_scheme = 'blueorange'
scale_reverse = True
scale_domain = [0, 2]
legend_format = '.1f'
axis_format = ',.0f'
elif metric_type == 'percent':
scale_scheme = 'redyellowblue'
scale_reverse = False
scale_domain = [0, 1]
legend_format = '.0%'
axis_format = '.0%'
tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
for field in ('y', 'x', 'percent'):
tooltips.append(alt.Tooltip(
fields_dict[field]['name'] + ':Q',
format=fields_dict[field]['format'],
title=fields_dict[field]['title'],
))
plot = alt.Chart(chart_df).mark_circle(size=60).encode(
alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
scale=alt.Scale(domain=(0, scale_max))
),
alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
scale=alt.Scale(domain=(0, scale_max))
),
color=alt.Color(fields_dict['percent']['name'],
type='quantitative',
scale=alt.Scale(scheme=scale_scheme,
reverse=scale_reverse,
domain=scale_domain,
clamp=True),
legend=alt.Legend(format=legend_format),
title=metric_type.capitalize()),
tooltip=tooltips,
).properties(
height=height,
width=width,
)
if metric_type == 'ratio':
plot.interactive()
line = pd.DataFrame({
'x': [0, scale_max],
'y': [0, scale_max],
})
if metric_type == 'ratio':
line_plot = alt.Chart(line).mark_line(color='black').encode(
x='x',
y='y',
)
elif metric_type == 'percent':
line_plot = (
alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
)
# Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
scatter = (plot + line_plot).properties(
title=title,
height=height,
width=width,
).interactive()
return scatter
def CreateMap(
chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
geo_field = 'state'
geo_field_display_name = 'State'
fips_code = 'state_fips_code'
topo_feature = us_states
if geo == 'county':
geo_field = 'state_county'
geo_field_display_name = 'County'
fips_code = 'county_fips'
topo_feature = us_counties
if metric_type == 'ratio':
scale_scheme = 'blueorange'
scale_reverse = True
scale_domain = [0, 2]
legend_format = '.1f'
elif metric_type == 'percent':
scale_scheme = 'redyellowblue'
scale_reverse = False
scale_domain = [0, 1]
legend_format = '.0%'
highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
for field in ('y', 'x', 'percent'):
tooltips.append(alt.Tooltip(
fields_dict[field]['name'] + ':Q',
format=fields_dict[field]['format'],
title=fields_dict[field]['title'],
))
field_names = [geo_field]
field_names.extend([fields_dict[field]['name'] for field in fields_dict])
plot = alt.Chart(topo_feature).mark_geoshape(
stroke='white',
strokeOpacity=.2,
strokeWidth=1
).project(
type='albersUsa'
).transform_lookup(
lookup='id',
from_=alt.LookupData(chart_df, fips_code, field_names)
).encode(
alt.Color(fields_dict['percent']['name'],
type='quantitative',
legend=alt.Legend(format=legend_format),
scale=alt.Scale(scheme=scale_scheme,
reverse=scale_reverse,
domain=scale_domain,
clamp=True,
),
title=metric_type.capitalize()),
tooltip=tooltips
).add_selection(
highlight,
)
states_outline = alt.Chart(us_states).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
type='albersUsa'
)
states_fill = alt.Chart(us_states).mark_geoshape(
fill='silver',
stroke='white'
).project('albersUsa')
layered_map = alt.layer(states_fill, plot, states_outline).properties(
height=height,
width=width,
title=title,
)
return layered_map
def CreateScatterPlotAndMap(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
scatter = CreateScatterPlot(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
map = CreateMap(
chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
return (scatter | map).configure_view(
strokeWidth=0,
).configure_mark(
stroke='grey'
).configure_legend(
gradientLength=scatter_height - 50
)
def PrintSummaryStats(chart_df, field='percent'):
plus_minus_15_df = chart_df[chart_df[field] >= .85]
plus_minus_15_df = plus_minus_15_df[plus_minus_15_df[field] <= 1.15]
print('between +/-15%: ', len(plus_minus_15_df), round(len(plus_minus_15_df) / len(chart_df), 2))
plus_minus_50_df = chart_df[chart_df[field] >= .50]
plus_minus_50_df = plus_minus_50_df[plus_minus_50_df[field] <= 1.50]
print('between +/-50%: ', len(plus_minus_50_df), round(len(plus_minus_50_df) / len(chart_df), 2))
print('< than .50: ', len(chart_df[chart_df[field] < .5]))
print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
print(chart_df[field].describe())
#@title
crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)
crdt_df.set_index('state', inplace=True)
nyt_states_df = pd.io.gbq.read_gbq(nyt_states_query, project_id=project_id)
nyt_states_df.state_fips_code.unique()
nyt_territories = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
for territory in nyt_territories:
nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
nyt_states_df.set_index('state_fips_code', inplace=True)
crdt_df.reset_index(inplace=True)
crdt_df['state_fips_code'] = crdt_df.state
crdt_df = crdt_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_df.set_index('state_fips_code', inplace=True)
nyt_crdt_merged_df = nyt_states_df.join(crdt_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')
nyt_crdt_merged_df['percent'] = round(nyt_crdt_merged_df.nyt_cases / nyt_crdt_merged_df.crdt_cases, 2)
nyt_crdt_merged_df
nyt_crdt_merged_df.reset_index(inplace=True)
below_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent < .85]) / len(nyt_crdt_merged_df)
above_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent > 1.15]) / len(nyt_crdt_merged_df)
#print('between +/-15%: ', round(1 - below_15 - above_15, 2))
#nyt_crdt_merged_df.percent.describe()
#@title
nyt_crdt_fields_dict = {
'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
'y': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of NYT to CRDT'},
}
nyt_crdt_title = 'Ratio of NYT to CRDT Deaths by State as of %s' % date_display_name
CreateScatterPlotAndMap(
nyt_crdt_merged_df, nyt_crdt_fields_dict, nyt_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()
We can see below that the CDC death counts differ from the CRDT death counts much more drastically than the NYT did.
#@title
cdc_states_df = pd.io.gbq.read_gbq(cdc_states_query, project_id=project_id)
cdc_states_df.rename(columns={'res_state': 'state'}, inplace=True)
cdc_states_df.set_index('state', inplace=True)
crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)
for territory in territories:
crdt_df = crdt_df[crdt_df.state != territory]
crdt_df.set_index('state', inplace=True)
cdc_crdt_merged_df = cdc_states_df.join(crdt_df, on="state", how='right', lsuffix='_left', rsuffix='_right')
cdc_crdt_merged_df.reset_index(inplace=True)
cdc_crdt_merged_df['state_fips_code'] = cdc_crdt_merged_df.state
cdc_crdt_merged_df = cdc_crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
cdc_crdt_merged_df['percent'] = round(cdc_crdt_merged_df.cdc_cases / cdc_crdt_merged_df.crdt_cases, 4)
# PrintSummaryStats(cdc_crdt_merged_df)
#@title
# CDC vs. NYT county
df = pd.io.gbq.read_gbq(cdc_counties_query, project_id=project_id)
for territory in territories:
df = df[df.res_state != territory]
df_county_fips_map = pd.io.gbq.read_gbq(county_fips_mapping_query, project_id=project_id)
df_county_fips_map.cdc_county = df_county_fips_map.cdc_county.str.lower()
df_county_fips_map['state_county'] = df_county_fips_map.state + '-' + df_county_fips_map.cdc_county
df_county_fips_map['state_county'] = df_county_fips_map.state_county.astype('string').str.strip()
df_county_fips_map.set_index('state_county', inplace=True)
#@title
# Concatenate the state and county names because county names are not unique across states.
df.res_county = df.res_county.str.lower()
df['state_county'] = df.res_state + '-' + df.res_county
df['state_county'] = df.state_county.astype('string').str.strip()
df.set_index('state_county', inplace=True)
df['race_ethnicity_combined'] = df.race_ethnicity_combined.astype('string').str.strip()
df = df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
#@title
mismatches_df = df.join(df_county_fips_map, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
#print(mismatches_df.cases.sum())
#print('vs. 60363 with NULL county_fips_code')
# SELECT
#count(*) as total_cases,
#FROM `msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20201231`
#WHERE county_fips_code IS NULL
#@title
merged_df = df.join(df_county_fips_map, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')
# Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
crosstab_df = pd.crosstab(merged_df['county_fips'], merged_df.race_ethnicity_combined, values=merged_df.cases, aggfunc=sum,
margins=True,
margins_name='total_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['county_fips'] = crosstab_df.county_fips.astype(int)
crosstab_df['total_known_cases'] = crosstab_df['total_cases'] - crosstab_df.na_cases.fillna(0) - crosstab_df.unknown_cases.fillna(0)
#@title
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
df_acs_name_lookup = pd.io.gbq.read_gbq(acs_population_data_query, project_id=project_id)
df_acs_name_lookup['state_county'] = df_acs_name_lookup.county.astype('string').str.strip() + ', ' + df_acs_name_lookup.state.astype('string').str.strip()
df_acs_name_lookup.drop(columns=['state', 'county'], inplace=True)
df_acs_name_lookup.set_index('county_fips', inplace=True)
county_chart_df = crosstab_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_df.county_fips = county_chart_df.county_fips.astype(int)
#print(len(county_chart_df))
#print(county_chart_df.total_pop.sum())
#print(county_chart_df.total_pop.sum() / 324697795) # Population covered in these counties
#@title
nyt_counties_df = pd.io.gbq.read_gbq(nyt_counties_query, project_id=project_id)
nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_df.county_fips.unique()
nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
nyt_counties_df.set_index('county_fips', inplace=True)
county_chart_df.set_index('county_fips', inplace=True)
nyt_merged_df = county_chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_merged_df = nyt_merged_df.reset_index()
nyt_merged_df['percent'] = round(nyt_merged_df.total_cases / nyt_merged_df.nyt_cases, 2)
#PrintSummaryStats(nyt_merged_df)
#@title
cdc_crdt_fields_dict = {
'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_title = 'Ratio of CDC to CRDT Deaths by State as of %s' % date_display_name
CreateScatterPlotAndMap(
cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()
We can do the same analysis at the county level using the CDC vs. NYT data.
#@title
cdc_nyt_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC to NYT Deaths by County as of %s' % date_display_name
zoom_cdc_nyt_title = 'Zoom in on counties with up to 3,000 Deaths'
scatter = CreateScatterPlot(
nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, county_cases_scale_max, scatter_height, scatter_width, 'county', 'ratio'
)
zoom_scatter = CreateScatterPlot(
nyt_merged_df, cdc_nyt_fields_dict, zoom_cdc_nyt_title, county_cases_zoom_scale_max, scatter_height, scatter_width, 'county', 'ratio'
)
(scatter | zoom_scatter).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
#@title
cdc_nyt_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC to NYT Deaths by County as of %s' % date_display_name
cdc_nyt_map = CreateMap(
nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height, map_width, 'county', 'ratio'
)
cdc_crdt_map = CreateMap(
cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, map_height, map_width, 'state', 'ratio'
)
(cdc_crdt_map | cdc_nyt_map).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).display()
Notes:
#@title
states_df = pd.io.gbq.read_gbq(compare_cases_unknowns_query, project_id=project_id)
for state in ('Unknown', 'NA', 'OCONUS'):
states_df = states_df[states_df.res_state != state]
states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
states_df = states_df.replace(to_replace={'race_ethnicity_combined': {
'Asian, Non-Hispanic': 'cdc_known_cases',
'Black, Non-Hispanic': 'cdc_known_cases',
'White, Non-Hispanic': 'cdc_known_cases',
'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
'Hispanic/Latino': 'cdc_known_cases',
'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
'Missing': 'cdc_unknown_cases',
'Unknown': 'cdc_unknown_cases',
'NA': 'cdc_na_cases',
}})
states_df.rename(columns={'res_state': 'state'}, inplace=True)
#@title
crosstab_df = pd.crosstab(states_df['state'], states_df.race_ethnicity_combined, values=states_df.cdc_cases, aggfunc=sum,
margins=True,
margins_name='cdc_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['cdc_known_or_na_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df['cdc_known_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df
crdt_merged_df = crosstab_df.join(crdt_df, on="state", how='inner', lsuffix='_left', rsuffix='_right')
crdt_merged_df.reset_index(inplace=True)
crdt_merged_df['state_fips_code'] = crdt_merged_df.state
crdt_merged_df = crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_merged_df['cdc_known_cases_percent'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['cdc_known_or_na_cases_percent'] = round(crdt_merged_df.cdc_known_or_na_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['percent'] = round(crdt_merged_df.cdc_cases / crdt_merged_df.crdt_cases, 4)
crdt_merged_df['percent_known_cases'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.crdt_known_race_cases, 4)
crdt_merged_df_no_ny = crdt_merged_df[crdt_merged_df.state != 'NY']
#PrintSummaryStats(crdt_merged_df_no_ny)
How does the CDC dataset compare to the CRDT dataset, which is the most up-to-date aggregate data we have for race/ethnicity at the state level? Overall, 93% of the deaths in the CRDT dataset have known race/ethnicity compared to 75% in the CDC dataset (77% with suppressed).
We may even be undercounting the 93% of deaths with known race/ethnicity in the CRDT data because of the non-standard ways that each state reports on race/ethnicity, as described in this Covid Racial Data Tracker analysis. If a state uses a combined race/ethnicity field, then it's a straightforward comparison to the CDC's combined race/ethnicity field. If a state uses separate fields for race/ethnicity, then we still use the number of people with known race within each state because all of the race categories will also contain Hispanic/Latino people. We could potentially be undercounting the number of people with known race/ethnicity in the CRDT if there are people who have unknown race but known ethnicity. If we adjusted the numbers in those cases, it would make the CRDT percentages look even better in comparison to the CDC data.
#@title
cdc_known_state_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_state_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_state_map = CreateMap(
crdt_merged_df, cdc_known_state_fields_dict, cdc_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)
crdt_known_state_fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}
crdt_known_state_title = 'CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name
crdt_known_map = CreateMap(
cdc_crdt_merged_df, crdt_known_state_fields_dict, crdt_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)
(crdt_known_map | cdc_known_state_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).display()
#@title
fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT known race/ethnicity deaths'},
'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC known race/ethnicity deaths'},
'percent': {'name': 'percent_known_cases', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
title = 'Ratio of CDC to CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name
CreateScatterPlotAndMap(
crdt_merged_df, fields_dict, title, cases_known_scale_max, scatter_height, scatter_width, map_width - 5, 'state', 'ratio'
).display()
#@title
#print('CRDT known % ', crdt_merged_df.crdt_known_race_cases.sum() / crdt_merged_df.crdt_cases.sum())
#PrintSummaryStats(cdc_crdt_merged_df, field='crdt_known_race_cases_percent')
We can also look at the percentage of cases with known race/ethnicity at the county level alongside the state-level numbers.
#@title
chart_df = county_chart_df.copy(deep=True)
chart_df.reset_index(inplace=True)
chart_df.county_fips = chart_df.county_fips.astype(int)
chart_df['percent_known_cases'] = round(chart_df.total_known_cases / chart_df.total_cases, 2)
chart_df['total_known_or_na_cases'] = chart_df.total_known_cases.fillna(0) + chart_df.na_cases.fillna(0)
chart_df['percent_known_or_na_cases'] = round(chart_df.total_known_or_na_cases / chart_df.total_cases, 2)
#@title
cdc_known_county_fields_dict = {
'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_county_map = CreateMap(
chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)
(cdc_known_state_map | cdc_known_county_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).display()
Later on, we'll look at the effect of data suppression on these numbers at the county level, especially counties with small populations.
#@title
#PrintSummaryStats(crdt_merged_df, field='cdc_known_cases_percent')
#PrintSummaryStats(crdt_merged_df, field='cdc_known_or_na_cases_percent')
#tuple(crdt_merged_df[crdt_merged_df.cdc_known_or_na_cases_percent <= .5].state)
Note: A larger version of the county maps for hovering over smaller counties is available in the Appendix.
The CDC has an alternative public source for death data with race/ethnicity at the state and county levels that comes from death certificates via the NCHS. This dataset is more complete than the case surveillance dataset in every way except that the county-level dataset only contains about one fourth of the counties in the CDC case data. The reason for this is that the NCHS county dataset only includes counties with 100 or more deaths. The NCHS state dataset also has a limit of 100 or more deaths per state, but all states had reached that threshold by Mar 3, 2021.
The table below summarizes the completeness of the CDC Case dataset vs. the NCHS datasets for counties and states.
#@title
# Manually update these fields based on the latest CDC data.
row_names = [
'Update frequency',
'Latest deaths date',
'Deaths in dataset as of date',
'Deaths in CTP as of date',
'(as a % of CTP)',
'Number of counties',
'(as a % of all counties)',
'Population in those counties',
'(as a % of total U.S population – States + D.C.)',
'Deaths with known race/ethnicity',
'(as a % of deaths in dataset)',
'Access',
'Limitations'
]
cdc_deaths_county_metadata = [
'Weekly',
'Mar 3, 2021',
'439K',
'509K',
'(86%)',
'712',
'(23%)',
'260K',
'(80%)',
'432K',
'(98%)',
'Public',
'Counties >= 100 deaths'
]
cdc_deaths_state_metadata = [
'Weekly',
'Mar 3, 2021',
'494K',
'509K',
'(97%)',
'-',
'-',
'-',
'-',
'491K',
'(99%)',
'Public',
'-'
]
cdc_cases_metadata = [
'Monthly',
'Feb 13, 2021',
'372K',
'474K',
'(78%)',
'2,391',
'(76%)',
'280M',
'(86%)',
'279K',
'(75%)',
'Restricted',
'Data completeness issues'
]
table_data = {'CDC': cdc_cases_metadata,
'NCHS state': cdc_deaths_state_metadata,
'NCHS county': cdc_deaths_county_metadata,
}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
The NCHS datasets are more complete at the U.S. level on all measures except for the number of counties contained in the county-level data. The American Public Media Research Lab found that the NCHS state dataset was more complete than public health website data for 15 states.
#@title
cdc_provisional_deaths_state_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_query, project_id=project_id)
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.replace(
to_replace={'State': states_to_abbreviations})
cdc_provisional_deaths_state_counts_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_counts_query, project_id=project_id)
cdc_provisional_deaths_state_counts_df = cdc_provisional_deaths_state_counts_df.replace(
to_replace={'State': states_to_abbreviations})
cdc_provisional_deaths_state_df['cdc_known_cases_percent'] = round((
cdc_provisional_deaths_state_df.Non_Hispanic_White.fillna(0) +
cdc_provisional_deaths_state_df.Non_Hispanic_Black_or_African_American.fillna(0) +
cdc_provisional_deaths_state_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
cdc_provisional_deaths_state_df.Non_Hispanic_Asian.fillna(0) +
cdc_provisional_deaths_state_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
cdc_provisional_deaths_state_df.Hispanic_or_Latino.fillna(0)) / 100, 4)
cdc_provisional_deaths_state_counts_df['cdc_known_cases'] = round(
cdc_provisional_deaths_state_counts_df.Non_Hispanic_White.fillna(0) +
cdc_provisional_deaths_state_counts_df.Non_Hispanic_Black_or_African_American.fillna(0) +
cdc_provisional_deaths_state_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
cdc_provisional_deaths_state_counts_df.Non_Hispanic_Asian.fillna(0) +
cdc_provisional_deaths_state_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
cdc_provisional_deaths_state_counts_df.Hispanic_or_Latino.fillna(0))
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.set_index('State')
cdc_provisional_deaths_state_counts_df = cdc_provisional_deaths_state_counts_df.set_index('State')
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.join(
cdc_provisional_deaths_state_counts_df, on="State", how='inner', lsuffix='_left', rsuffix='_right')
# Extrapolate from the % known race/ethnicity cases and their counts to the total case count for the state.
cdc_provisional_deaths_state_df['cdc_cases'] = round(
cdc_provisional_deaths_state_df.cdc_known_cases / cdc_provisional_deaths_state_df.cdc_known_cases_percent, 0)
# Comnbine the case counts for NY and New York City, recalculate percentage, remove New York City.
cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'] = (
cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'] +
cdc_provisional_deaths_state_df.loc['New York City', 'cdc_cases'])
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] = (
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] +
cdc_provisional_deaths_state_df.loc['New York City', 'cdc_known_cases'])
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases_percent'] = round(
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] /
cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'], 4)
cdc_provisional_deaths_state_df.drop(['New York City'], inplace=True)
cdc_provisional_deaths_state_df.reset_index(inplace=True)
cdc_provisional_deaths_state_df.rename(columns={'State': 'state'}, inplace=True)
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df[
['state', 'cdc_cases', 'cdc_known_cases', 'cdc_known_cases_percent']].copy()
crdt_provisional_df = pd.io.gbq.read_gbq(crdt_provisional_query, project_id=project_id)
crdt_provisional_df.set_index('state', inplace=True)
cdc_provisional_crdt_merged_df = cdc_provisional_deaths_state_df.join(crdt_provisional_df, on="state", how='right', lsuffix='_left', rsuffix='_right')
cdc_provisional_crdt_merged_df.reset_index(inplace=True)
cdc_provisional_crdt_merged_df['state_fips_code'] = cdc_provisional_crdt_merged_df.state
cdc_provisional_crdt_merged_df = cdc_provisional_crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
cdc_provisional_crdt_merged_df['percent'] = round(cdc_provisional_crdt_merged_df.cdc_cases / cdc_provisional_crdt_merged_df.crdt_cases, 4)
cdc_provisional_crdt_merged_df['percent_known_cases'] = round(cdc_provisional_crdt_merged_df.cdc_known_cases / cdc_provisional_crdt_merged_df.crdt_known_race_cases, 4)
# Counties data
df_acs_name_lookup = pd.io.gbq.read_gbq(acs_population_data_query, project_id=project_id)
df_acs_name_lookup.set_index('county_fips', inplace=True)
cdc_provisional_deaths_county_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_county_query, project_id=project_id)
cdc_provisional_deaths_county_df['county_fips'] = cdc_provisional_deaths_county_df.FIPS_Code
cdc_provisional_deaths_county_df.set_index('county_fips', inplace=True)
cdc_provisional_deaths_county_df['state_county'] = cdc_provisional_deaths_county_df.County_Name + ', ' + cdc_provisional_deaths_county_df.State
cdc_provisional_deaths_county_df['total_known_cases'] = round((
cdc_provisional_deaths_county_df.Non_Hispanic_White.fillna(0) +
cdc_provisional_deaths_county_df.Non_Hispanic_Black.fillna(0) +
cdc_provisional_deaths_county_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
cdc_provisional_deaths_county_df.Non_Hispanic_Asian.fillna(0) +
cdc_provisional_deaths_county_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
cdc_provisional_deaths_county_df.Hispanic.fillna(0)) * cdc_provisional_deaths_county_df.COVID_19_Deaths, 0)
county_chart_provisional_df = cdc_provisional_deaths_county_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_provisional_df.reset_index(inplace=True)
county_chart_provisional_df.county_fips = county_chart_provisional_df.county_fips.astype(int)
nyt_counties_provisional_df = pd.io.gbq.read_gbq(nyt_counties_provisional_query, project_id=project_id)
nyt_counties_provisional_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_provisional_df.county_fips.unique()
nyt_counties_provisional_df['county_fips'] = nyt_counties_provisional_df.county_fips.astype(int)
nyt_counties_provisional_df.set_index('county_fips', inplace=True)
county_chart_provisional_df.set_index('county_fips', inplace=True)
nyt_merged_provisional_df = county_chart_provisional_df.join(nyt_counties_provisional_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
nyt_merged_provisional_df = nyt_merged_provisional_df.reset_index()
nyt_merged_provisional_df['percent'] = round(nyt_merged_provisional_df.COVID_19_Deaths / nyt_merged_provisional_df.nyt_cases, 4)
nyt_merged_provisional_df['percent_known_cases'] = round(nyt_merged_provisional_df.total_known_cases / nyt_merged_provisional_df.COVID_19_Deaths, 4)
#print('total deaths: ', nyt_merged_provisional_df.COVID_19_Deaths.sum())
#print('total counties: ', len(county_chart_provisional_df))
#print('as % of counties: ', len(county_chart_provisional_df) / 3143)
#print('population in counties: ', county_chart_provisional_df.total_pop.sum())
#print('as % of total population: ', county_chart_provisional_df.total_pop.sum() / 324697795) # Population covered in these counties
#print('known race/ethnicity: ', nyt_merged_provisional_df.total_known_cases.sum())
#print('% known race/ethnicity: ', nyt_merged_provisional_df.total_known_cases.sum() / nyt_merged_provisional_df.COVID_19_Deaths.sum())
#print(0.55 * 324697795) # NYT population
#print('state file deaths: ', cdc_provisional_crdt_merged_df.cdc_cases.sum())
#print('state file known race/ethnicity: ', cdc_provisional_crdt_merged_df.cdc_known_cases.sum())
#print('state file % known race/ethnicity: ', cdc_provisional_crdt_merged_df.cdc_known_cases.sum() / cdc_provisional_crdt_merged_df.cdc_cases.sum())
#@title
cdc_crdt_fields_dict = {
'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_title = 'Ratio of NCHS to CRDT Deaths as of %s' % provisional_date_display_name
CreateScatterPlotAndMap(
cdc_provisional_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()
#@title
cdc_nyt_provisional_fields_dict = {
'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_provisional_title = 'Ratio of NCHS to NYT Deaths as of %s' % provisional_date_display_name
scatter = CreateScatterPlotAndMap(
nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, 18000, scatter_height, scatter_width, map_width, 'county', 'ratio'
)
(scatter).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
In the charts and maps below, we can see that the NCHS datasets have a strikingly high percentage of deaths with known race/ethnicity. The state dataset had both counts and percentages of deaths from COVID-19 within each race/ethnicity group but not a total deaths count. There is an "Other" category that includes "More than one race or Unknown," and we treated that entire category as Unknown race/ethnicity. The percentages for all the categories including Other didn't always add up to 100%, so we treated the remaining percentage as Unknowns as well (data for a group can be suppressed if that group had fewer than 10 deaths). The county dataset included only percentages of deaths from COVID-19 within each race/ethnicity group, but it did include a total count per county so that we could calculate counts.
#@title
crdt_provisional_known_state_fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}
crdt_provisional_known_state_title = 'CRDT Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
crdt_provisional_known_map = CreateMap(
cdc_provisional_crdt_merged_df, crdt_provisional_known_state_fields_dict, crdt_provisional_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)
cdc_provisional_known_state_fields_dict = {
'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_provisional_known_state_title = 'NCHS Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
cdc_provisional_known_state_map = CreateMap(
cdc_provisional_crdt_merged_df, cdc_provisional_known_state_fields_dict, cdc_provisional_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)
(crdt_provisional_known_map | cdc_provisional_known_state_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).display()
#@title
fields_dict = {
'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT known race/ethnicity deaths'},
'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC known race/ethnicity deaths'},
'percent': {'name': 'percent_known_cases', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
title = 'Ratio of NCHS to CRDT Deaths with Known Race/Ethnicity'
CreateScatterPlotAndMap(
cdc_provisional_crdt_merged_df, fields_dict, title, 40000, scatter_height, scatter_width, map_width - 5, 'state', 'ratio'
).display()
#@title
cdc_known_county_provisional_fields_dict = {
'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_provisional_title = 'NCHS Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
cdc_known_county_provisional_map = CreateMap(
nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)
(cdc_known_county_provisional_map).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).display()
How can we evaluate the completeness of race/ethnicity data at the state and county levels so that we know which data are usable?
In the Total Death Counts section above, we identified the states and counties with the biggest discrepancies relative to the total death counts from aggregate data. In the Deaths with Race/Ethnicity section, we looked at the percentage of deaths within each state and county that have race/ethnicity data.
The charts below show those two components together; the scatterplots show (1) the CDC death counts as a percentage of the CRDT/NYT total case counts on the y-axis, and (2) the percentage of CDC deaths with known race/ethnicity on the x-axis. The colors of the dots and on the map show the composite measure of completeness by multiplying those two numbers together, which is the percentage of expected total deaths that have race/ethnicity in the CDC dataset.
The scatterplots below can help us identify which states and counties have usable data and what the completeness issues are:
#@title
nyt_cdc_known_merged_df = chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_cdc_known_merged_df.reset_index(inplace=True)
nyt_cdc_known_merged_df['percent'] = round(nyt_cdc_known_merged_df.total_cases / nyt_cdc_known_merged_df.nyt_cases, 2)
#@title
crdt_merged_df['percent_max_100'] = crdt_merged_df.percent.clip(upper=1)
crdt_merged_df['percent_reccs'] = crdt_merged_df.percent_max_100 * crdt_merged_df.cdc_known_cases_percent
state_reccs_fields_dict = {
'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of CRDT total deaths'},
'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of CRDT total with race/ethnicity'},
}
state_reccs_title = 'CDC Percent of Total Deaths x Race/Ethnicity as of %s' % date_display_name
scatter = CreateScatterPlotAndMap(
crdt_merged_df, state_reccs_fields_dict, state_reccs_title, 1, scatter_height, scatter_width, map_width, 'state', 'percent'
)
scatter.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
#@title
nyt_cdc_known_merged_df['percent_max_100'] = nyt_cdc_known_merged_df.percent.clip(upper=1)
nyt_cdc_known_merged_df['percent_reccs'] = nyt_cdc_known_merged_df.percent_max_100 * nyt_cdc_known_merged_df.percent_known_cases
county_reccs_fields_dict = {
'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
'x': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_title = state_reccs_title = 'CDC Percent of Total Deaths x Race/Ethnicity as of %s' % date_display_name
scatter = CreateScatterPlotAndMap(
nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
Notes:
In the NCHS datasets, almost all states and counties are in the top right quadrant, which means that they have a mid-high percentage of total deaths and a mid-high percentage of deaths with race/ethnicity.
#@title
cdc_provisional_crdt_merged_df['percent_max_100'] = cdc_provisional_crdt_merged_df.percent.clip(upper=1)
cdc_provisional_crdt_merged_df['percent_reccs'] = cdc_provisional_crdt_merged_df.percent_max_100 * cdc_provisional_crdt_merged_df.cdc_known_cases_percent
provisional_state_reccs_fields_dict = {
'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of CRDT total deaths'},
'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with known or suppressed race/ethnicity'},
'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of CRDT total with race/ethnicity'},
}
provisional_state_reccs_title = 'NCHS Percent of Total Deaths x Race/Ethnicity as of %s' % provisional_date_display_name
scatter = CreateScatterPlotAndMap(
cdc_provisional_crdt_merged_df, provisional_state_reccs_fields_dict, provisional_state_reccs_title, 1, scatter_height, scatter_width, map_width, 'state', 'percent'
)
scatter.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
#@title
nyt_merged_provisional_df['percent_max_100'] = nyt_merged_provisional_df.percent.clip(upper=1)
nyt_merged_provisional_df['percent_reccs'] = nyt_merged_provisional_df.percent_max_100 * nyt_merged_provisional_df.percent_known_cases
county_reccs_provisional_fields_dict = {
'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
'x': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_provisional_title = state_reccs_title = 'NCHS Percent of Total Deaths x Race/Ethnicity as of %s' % provisional_date_display_name
scatter_provisional = CreateScatterPlotAndMap(
nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter_provisional.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
Notes:
We can get an overall measure of completeness if we look at the number of states in the top right corner of the scatterplot where the composite score is > 85% (where the dots turn dark blue) and > 50% (where the dots turn yellow).
#@title
#PrintSummaryStats(crdt_merged_df, field='percent_reccs')
#PrintSummaryStats(nyt_cdc_known_merged_df, field='percent_reccs')
#greater_than_85_df = nyt_cdc_known_merged_df[nyt_cdc_known_merged_df['percent_reccs'] > .85]
#print('total pop > 85%: ', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = nyt_cdc_known_merged_df[nyt_cdc_known_merged_df['percent_reccs'] > .50]
#print('total pop > 50%: ', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
#@title
#PrintSummaryStats(cdc_provisional_crdt_merged_df, field='percent_reccs')
#PrintSummaryStats(nyt_merged_provisional_df, field='percent_reccs')
#greater_than_85_df = nyt_merged_provisional_df[nyt_merged_provisional_df['percent_reccs'] > .85]
#print('total pop > 85%: %d, %f', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = nyt_merged_provisional_df[nyt_merged_provisional_df['percent_reccs'] > .50]
#print('total pop > 50%: %d, %f', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
#@title
# Manually update these fields based on the latest CDC data.
row_names = [
'Number of states with composite > 85%',
'(as a percent of all states)',
'Number of states with composite > 50%',
'(as a percent of all states)',
'Number of counties with composite > 85%',
'(as a percent of all counties)',
'Number of counties with composite > 50%',
'(as a percent of all counties)',
'Population in counties with composite > 85%',
'(as a % of total U.S population – States + D.C.)',
'Population in counties with composite > 50%',
'(as a % of total U.S population – States + D.C.)',
]
crdt_metadata = [
'45',
'(88%)',
'50',
'(98%)',
'-',
'-',
'-',
'-',
'-',
'-',
'-',
'-',
]
cdc_state_provisional_metadata = [
'44',
'(86%)',
'50',
'(98%)',
'-',
'-',
'-',
'-',
'-',
'-',
'-',
'-',
]
cdc_metadata = [
'8',
'(17%)',
'32',
'(67%)',
'560',
'(18%)',
'1,560',
'(50%)',
'77M',
'(24%)',
'185M',
'(56%)',
]
cdc_county_provisional_metadata = [
'-',
'-',
'-',
'-',
'519',
'(16%)',
'690',
'(22%)',
'189M',
'(58%)',
'249M',
'(76%)',
]
table_data = {'CRDT': crdt_metadata,
'CDC': cdc_metadata,
'NCHS state': cdc_state_provisional_metadata,
'NCHS county': cdc_county_provisional_metadata}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
If we require that states or counties have 85% of total expected deaths with race/ethnicity, we can use the NCHS state dataset for 86% of states vs. 17% from the CDC case dataset. We can use the NCHS county dataset for 16% of counties vs. 18% from the CDC case dataset, where the NCHS's counties account for 58% of the U.S. population vs. 24% from the CDC case dataset.
If we loosen that requirement to 50% of total expected deaths with race/ethnicity, we can use the NCHS state dataset for 98% of states vs. 67% from the CDC case dataset. We can use the NCHS county dataset for 22% of counties vs. 50% from the CDC case dataset, where the NCHS's counties account for 76% of the U.S. population vs. 56% from the CDC case dataset.
If you are looking for state- or county-level deaths data with race/ethnicity, the NCHS datasets are more complete than the CDC Case Surveillance dataset in almost every way: 98%-99% of the deaths reported have race/ethnicity, they generally do not lag far behind state website data as reported in the CRDT/NYT, and they are updated once a week rather than once a month.
However, there are a few tradeoffs with using the NCHS state dataset:
There are a few additional tradeoffs with using the NCHS county dataset:
There are two ways in which states can improve the data they report to the CDC:
When evaluating the percent of deaths that report on race/ethnicity in the CDC dataset, we also need to consider the 2% of overall deaths with race/ethnicity that were suppressed due to privacy reasons. We should give states and counties credit for reporting race/ethnicity data for those deaths even if we aren't able to use it due to privacy suppression. Below, the map on the top left shows the percent of deaths with known race/ethnicity and the map on the top right shows the percent of deaths with known or suppressed race/ethnicity. The maps below that show the same information at the county level.
#@title
cdc_known_or_na_state_fields_dict = {
'x': {'name': 'cdc_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'cdc_known_or_na_cases_percent', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_state_title = 'CDC Deaths with Known+Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_state_map = CreateMap(
crdt_merged_df, cdc_known_or_na_state_fields_dict, cdc_known_or_na_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)
(cdc_known_state_map | cdc_known_or_na_state_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).display()
#@title
cdc_known_county_fields_dict = {
'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_county_map = CreateMap(
chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)
cdc_known_or_na_county_fields_dict = {
'x': {'name': 'total_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
'percent': {'name': 'percent_known_or_na_cases', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_county_title = 'CDC Deaths with Known+Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_county_map = CreateMap(
chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)
(cdc_known_county_map | cdc_known_or_na_county_map).configure(
padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).display()
Note: A larger version of the county maps for hovering over smaller counties is available in the Appendix.
We can see that race/ethnicity data is missing in some states such as California, North Dakota, Pennsylvania, and New York.
But the main way to improve the completeness of the CDC case data is for more states and counties to report whether a person died; this information is missing from most or all counties in Texas, New Mexico, South Dakota, Nebraska, Missouri, West Virginia, and Maryland.
To make it easier to hover over small counties, here are larger versions of the county maps that appeared in this report.
#@title
cdc_nyt_map = CreateMap(
nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
)
cdc_nyt_map.display()
#@title
cdc_known_county_map = CreateMap(
chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
)
cdc_known_county_map.display()
#@title
cdc_known_or_na_county_map = CreateMap(
chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
)
cdc_known_or_na_county_map.display()
#@title
county_completeness = CreateMap(
nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)
county_completeness.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
#@title
cdc_nyt_map = CreateMap(
nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
)
cdc_nyt_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
#@title
cdc_known_county_map = CreateMap(
nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
)
cdc_known_county_map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
#@title
map = CreateMap(
nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)
map.configure_view(
strokeWidth=0,
).configure_legend(
gradientLength=map_height - 50
).configure_mark(
stroke='grey'
).display()
The NCHS state dataset had separate rows for New York State and New York City; we combined them into New York State.
Please email us at shli-covid-data-analysis@googlegroups.com with questions or comments.
#%%shell
#jupyter nbconvert --to html 'cdc_death_data.ipynb'